Install and load necessary packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(magrittr)
library(readr)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ car::recode() masks dplyr::recode()
## ✖ MASS::select() masks dplyr::select()
## ✖ purrr::set_names() masks magrittr::set_names()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Load the data
data <- read_csv("house-prices-advanced-regression-techniques/train.csv")
## Rows: 1460 Columns: 81
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (38): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Filter the data for specific neighborhoods & omit NA
filtered_data <- data %>%
filter(Neighborhood %in% c("NAmes", "Edwards", "BrkSide")) %>%
drop_na(SalePrice, GrLivArea, Neighborhood)
Summary statistics
summary_stats <- summary(filtered_data[c("SalePrice", "GrLivArea")])
summary_stats
## SalePrice GrLivArea
## Min. : 39300 Min. : 334
## 1st Qu.:116000 1st Qu.:1003
## Median :135500 Median :1200
## Mean :138062 Mean :1302
## 3rd Qu.:155000 3rd Qu.:1496
## Max. :345000 Max. :5642
Plotting distributions
# Histogram of Sale Prices
ggplot(filtered_data, aes(x = SalePrice)) + geom_histogram(bins = 30,fill = "blue",alpha = 0.7) +
ggtitle("Distribution of Sale Prices")

# Histogram of Living Area
ggplot(filtered_data, aes(x = GrLivArea)) + geom_histogram(bins = 30, fill = "red",alpha = 0.7) +
ggtitle("Distribution of Living Area")

# SalePrice vs GrLivArea scatterplot by Neighborhood
ggplot(filtered_data, aes(x = GrLivArea, y = SalePrice)) +
geom_point() +
facet_wrap(~ Neighborhood) +
labs(title = "SalePrice vs GrLivArea in Selected Neighborhoods",
x = "Living Area (GrLivArea)", y = "Sale Price")

# SalePrice vs GrLivArea Scatterplot
ggplot(filtered_data, aes(x = GrLivArea, y = SalePrice, color = Neighborhood)) +
geom_point() +
labs(title = "SalePrice vs GrLivArea in Selected Neighborhoods",
x = "Living Area", y = "Sale Price") +
theme_minimal()

Influence measures
influence_measures <- cooks.distance(model)
# Plot Cook's distance
plot(influence_measures,
type = "h",
main = "Cook's Distance",
ylab = "Cook's Distance")

Identify high leverage points
influential_points <-
which(influence_measures > (4/383))
influential_points
## 19 48 64 70 90 131 140 157 164 169 180 190 205 227 234 240 250 302 314 322
## 19 48 64 70 90 131 140 157 164 169 180 190 205 227 234 240 250 302 314 322
## 339 360 372
## 339 360 372
Internal CV Press
# Function to calculate PRESS
calculate_press <- function(model, data) {
n <- nrow(data)
press <- 0
for (i in 1:n) {
# Fit model without the ith observation
model_loo <- update(model, subset = -i)
# Predict the ith observation
pred <- predict(model_loo, data[i, , drop = FALSE])
# Calculate squared prediction error and add to PRESS
press <- press + (data$LogSalePrice[i] - pred)^2
}
return(press)
}
# Calculate PRESS statistics
press_original <- calculate_press(model, refined_data)
press_logSP <- calculate_press(logSP_model, refined_data)
press_logLA <- calculate_press(logLA_model, refined_data)
press_bothlog <- calculate_press(logboth_model, refined_data)
Compare Adjusted R- squared and Internal CV Press for models
cat("Adjusted R-squared for the original model:", modelsum$adj.r.squared, "\n")
## Adjusted R-squared for the original model: 0.4400466
cat("Adjusted R-squared for the LogSalePrice model:", sumlogSP_model$adj.r.squared, "\n")
## Adjusted R-squared for the LogSalePrice model: 0.5012172
cat("Adjusted R-squared for the LogLivingArea model:", sumlogLA_model$adj.r.squared, "\n")
## Adjusted R-squared for the LogLivingArea model: 0.5044703
cat("Adjusted R-squared for the both log model:", sumlogboth_model$adj.r.squared, "\n")
## Adjusted R-squared for the both log model: 0.5185443
cat("PRESS for original model:", press_original, "\n")
## PRESS for original model: 6.782684e+12
cat("PRESS for LogSalePrice model:", press_logSP, "\n")
## PRESS for LogSalePrice model: 11.17583
cat("PRESS for LogLivingArea model:", press_logLA, "\n")
## PRESS for LogLivingArea model: 6.573203e+12
cat("PRESS for log both model:", press_bothlog, "\n")
## PRESS for log both model: 10.76139
Parameters
sumlogboth_model
##
## Call:
## lm(formula = LogSalePrice ~ LogGrLivArea * Neighborhood, data = refined_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.69620 -0.09036 0.02169 0.10294 0.45985
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.09842 0.47243 12.909 < 2e-16 ***
## LogGrLivArea 0.79242 0.06737 11.762 < 2e-16 ***
## NeighborhoodEdwards 2.17461 0.68907 3.156 0.00174 **
## NeighborhoodNAmes 2.65458 0.57104 4.649 4.72e-06 ***
## LogGrLivArea:NeighborhoodEdwards -0.31346 0.09781 -3.205 0.00148 **
## LogGrLivArea:NeighborhoodNAmes -0.35654 0.08107 -4.398 1.45e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1713 on 354 degrees of freedom
## Multiple R-squared: 0.5252, Adjusted R-squared: 0.5185
## F-statistic: 78.33 on 5 and 354 DF, p-value: < 2.2e-16
model_CI <- confint(logboth_model, level = 0.95)
model_CI
## 2.5 % 97.5 %
## (Intercept) 5.1692864 7.0275447
## LogGrLivArea 0.6599205 0.9249283
## NeighborhoodEdwards 0.8194168 3.5298045
## NeighborhoodNAmes 1.5315274 3.7776375
## LogGrLivArea:NeighborhoodEdwards -0.5058356 -0.1210931
## LogGrLivArea:NeighborhoodNAmes -0.5159846 -0.1971023